Machine Learning Approaches for Ammonia Volatilization Prediction after Manure Application

Authors

Armand Favrot

Sophie Génermont

Céline Decuq

Jean-Marc Giliot

David Makowski

library ("readxl")
library ("ggplot2") # version 3.4.4
library ("gridExtra") # version 2.3
library ("dplyr") # version 1.1.2      
library ("paletteer") # version 1.4.0
library ("tibble") # version 3.2.1
library ("tidyr") # version 1.3.0
library ("ALFAM2") # version 3.7
library ("randomForest") # version 4.7.1.1
library ("xgboost") # version 1.7.6.1
library ("glmnet") # version 4.1.8
library ("treeshap") # version 0.3.1
library ("shapviz") # version 0.9.3
library ("kableExtra") # version 1.3.4
axis_text_size = 22 
axis_title_size = 26
title_size = 26

theme_replace(
    axis.text = element_text(size = axis_text_size), 
    axis.title.x = element_text(size = axis_title_size, angle = 0, margin = ggplot2::margin (t = 10)), 
    axis.title.y = element_text(size = axis_title_size, angle = 90, margin = ggplot2::margin (r = 10)), 
    plot.title = element_text(size = title_size, hjust = 0.5, face = "bold"), 
    legend.text = element_text(size = axis_title_size), 
    legend.title = element_text(size = axis_title_size, face = "bold"), 
    strip.text = element_text(size = axis_title_size, face = "bold", color = "white"), 
    strip.background = element_rect(fill = "#000080"), 
    panel.background = element_rect(fill = "#f3faff"), panel.grid.major = element_line(colour = "white"), 
    panel.spacing = unit(2, "lines")
)

options(ggplot2.discrete.fill = list("#67c5a7"))

Dark2 = paletteer_d("RColorBrewer::Dark2")

Data description

load (file = "scripts/processed_data/data_alfam2.Rdata")
data_alfam2 %>% summarise (n = n(), .by = dataset)
             dataset    n
1 Calibration subset 5515
2  Evaluation subset  424
plot_data_description = data_alfam2 %>%

    filter (! (app.mthd == "bsth" & incorp == "shallow")) %>%

    mutate (strategy = paste (app.mthd, incorp, sep = " - ")) %>%
    mutate (strategy = recode (strategy, "bc - none" = "A", "bc - shallow" = "B", "bc - deep" = "C", "bsth - none" = "D", "os - none" = "E", "ts - none" = "F")) %>%

    ggplot () +
        geom_line (aes (x = time, y = e.rel, group = pmid), linewidth = 0.15) +
        ylab ("Cumul of relative emission") +
        xlab ("Time (h)") +
        theme (axis.title.y = element_text (margin = ggplot2::margin (r = 25)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        # scale_color_manual (values = Dark2) +
        facet_wrap (~ strategy)

plot_data_description

data_alfam2 %>%

    select (air.temp, wind.2m, rain.rate, rain.cum) %>%

    summary 
    air.temp        wind.2m           rain.rate         rain.cum     
 Min.   :-1.90   Min.   : 0.05133   Min.   :0.0000   Min.   : 0.000  
 1st Qu.: 9.50   1st Qu.: 1.65665   1st Qu.:0.0000   1st Qu.: 0.000  
 Median :12.50   Median : 2.80000   Median :0.0000   Median : 0.000  
 Mean   :13.28   Mean   : 3.08744   Mean   :0.0406   Mean   : 1.131  
 3rd Qu.:16.85   3rd Qu.: 4.08000   3rd Qu.:0.0000   3rd Qu.: 0.400  
 Max.   :35.24   Max.   :16.78400   Max.   :7.1000   Max.   :55.900  
data_alfam2 %>% 

    filter (time == max (time), .by = pmid) %>%

    select (e.cum, time, tan.app, app.rate, man.dm, man.ph, t.incorp) %>%

    summary 
     e.cum              time          tan.app          app.rate     
 Min.   :  0.264   Min.   :20.75   Min.   : 10.90   Min.   :  6.60  
 1st Qu.:  5.815   1st Qu.:48.16   1st Qu.: 36.66   1st Qu.: 18.73  
 Median : 10.968   Median :70.30   Median : 54.28   Median : 28.40  
 Mean   : 16.162   Mean   :62.13   Mean   : 62.07   Mean   : 29.76  
 3rd Qu.: 21.473   3rd Qu.:71.70   3rd Qu.: 80.30   3rd Qu.: 36.65  
 Max.   :140.570   Max.   :78.00   Max.   :235.40   Max.   :132.60  
                                                                    
     man.dm           man.ph        t.incorp      
 Min.   : 1.000   Min.   :6.40   Min.   : 0.0000  
 1st Qu.: 3.790   1st Qu.:7.10   1st Qu.: 0.0000  
 Median : 6.755   Median :7.40   Median : 0.0833  
 Mean   : 6.248   Mean   :7.42   Mean   : 1.1878  
 3rd Qu.: 8.150   3rd Qu.:7.70   3rd Qu.: 0.0833  
 Max.   :13.600   Max.   :8.50   Max.   :48.0000  
                  NA's   :85     NA's   :475      

Part 1 - Model comparison

Alfam2

# We use alfam2pars01 parameter without pH, like in Hafner et al, 2019
pars <- alfam2pars01[!grepl('man.ph', names(alfam2pars01))]

alfam2_predictions =  alfam2 (
        
        pars = pars,
    
        dat = data_alfam2 %>% select (- j.NH3, - e.cum, - e.rel, - dataset, - country),
        
        app.name = "tan.app",
        
        time.name = "time",
    
        time.incorp = "t.incorp",
        
        group = "pmid",
    
        prep = TRUE,
        
        warn = FALSE
)

We add the true values to the prediction dataframe and we keep only the last observation of each pmid.

alfam2_predictions = alfam2_predictions %>%

        select (pmid, time, e, er) %>%

        mutate (truth_e = data_alfam2$e.cum) %>%
        mutate (truth_er = data_alfam2$e.rel) %>%

        mutate (dataset = data_alfam2$dataset,
                country = data_alfam2$country,
                app.mthd = data_alfam2$app.mthd) %>%
        
        select (pmid, time, e, er, truth_e, truth_er, dataset, country, app.mthd) %>% 

        filter (time == max (time), .by = pmid)
alfam2_predictions %>%
    mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
    mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
    mutate (residuals = truth_er - er) %>%
    pivot_longer (cols = c ("er", "truth_er", "residuals")) %>%
    mutate (name = recode (name, "er" = "prediction", "truth_er" = "truth")) %>%
    mutate (name = factor (name, levels = c ("truth", "prediction", "residuals"))) %>%
    ggplot () +
        geom_boxplot (aes (x = country, y = value, fill = name)) +
        geom_hline (yintercept = 0, linetype = 2) +
        facet_wrap (~ app.mthd, ncol = 1) +
        ylab ("Relative 72h cum. emission or residual") +
        labs (fill = "") +
        theme (legend.position = "bottom",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))

Random forest

load (file = "scripts/processed_data/data_random_forest.Rdata")
data_random_forest_calibration = data_random_forest %>% 
    filter (dataset == "Calibration subset") %>%
    select (- pmid, - e.rel, - dataset, - country)

set.seed (123)
random_forest_model = randomForest (
    
        e.cum ~ ., 

        data = data_random_forest_calibration,

        importance = TRUE,
    
        mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
    
)
plot (random_forest_model)

random_forest_predictions = data_random_forest %>%

    mutate (e.cum_hat = predict ( 
                random_forest_model,                       
                newdata = data_random_forest %>% 
                    select (- pmid, - e.cum, - e.rel, - dataset, - country)
            )
    ) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)    

Xgboost

load (file = "scripts/processed_data/data_xgboost.Rdata")
set.seed (123)
xgboost_model = xgboost (  

    data = xgb.DMatrix (

        data = data_xgboost %>% 
            filter (dataset == "Calibration subset") %>%
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
            as.matrix %>%
            {.}, 

        label = data_xgboost %>% 
            filter (dataset == "Calibration subset") %>%
            select (e.cum) %>% 
            as.matrix %>%
            {.}

    ),

    max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,

    verbose = FALSE,
    
    objective = "reg:squarederror"

)
xgboost_predictions = data_xgboost %>%

    mutate (e.cum_hat = predict (
        
                xgboost_model, 
                newdata = data_xgboost %>% 
                        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>% 
                        as.matrix
            )
    ) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)

Lasso

load (file = "scripts/processed_data/data_lasso.Rdata")
set.seed (123)
lasso_model = cv.glmnet (  

    x = data_lasso %>% 
        filter (dataset == "Calibration subset") %>%
        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
        as.matrix %>%
        {.}, 

    y = data_lasso %>% 
        filter (dataset == "Calibration subset") %>%
        select (e.cum) %>% 
        as.matrix %>%
        {.},

    alpha = 1
)
lasso_predictions_vector = predict (
        
    lasso_model, 

    data_lasso %>% 
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>% 
            as.matrix
)


df_join = data_alfam2 %>% select (pmid, app.mthd) %>% distinct

lasso_predictions = data_lasso %>%

    left_join (df_join, by = "pmid") %>%

    mutate (e.cum_hat = exp (lasso_predictions_vector), e.cum = exp (e.cum)) %>%

    mutate (e.rel_hat = e.cum_hat / tan.app) %>%

    select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) 

Comparison

predictions_of_all_methods = rbind (
    alfam2_predictions %>% mutate (method = "alfam2"), 
    random_forest_predictions %>% mutate (method = "random forest"),
    xgboost_predictions %>% mutate (method = "xgboost"),
    lasso_predictions %>% mutate (method = "lasso")
)
predictions_of_all_methods %>% head
  pmid  time         e        er truth_e truth_er            dataset country
1  182 44.75 21.077637 0.1726119 11.8310 0.096885 Calibration subset      DK
2  183 45.20  7.305541 0.1252665  5.5152 0.094567 Calibration subset      DK
3  184 45.15 14.693149 0.1867362 11.2350 0.142780 Calibration subset      DK
4  185 45.50  7.945285 0.1262760  7.2675 0.115500 Calibration subset      DK
5  186 43.85 14.753793 0.2585346  9.1200 0.159810 Calibration subset      DK
6  187 45.25  8.523611 0.1827650  6.5170 0.139740 Calibration subset      DK
  app.mthd method
1       bc alfam2
2     bsth alfam2
3       bc alfam2
4     bsth alfam2
5       bc alfam2
6     bsth alfam2

Residuals

plot_residuals_complete_dataset = predictions_of_all_methods %>%
    mutate (residuals = truth_er - er) %>%
    mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
    mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
    ggplot () +
        geom_boxplot (aes (x = country, y = residuals, fill = method)) +
        geom_hline (yintercept = 0, linetype = 2) +
        scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
        facet_wrap (~ app.mthd, ncol = 1) +
        ylab ("Residual") +
        labs (fill = "") +
        theme (legend.position = "bottom",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))

plot_residuals_complete_dataset

plot_residuals_evaluation_subset = predictions_of_all_methods %>%
    filter (dataset == "Evaluation subset") %>%
    mutate (residuals = truth_er - er) %>%
    mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
    mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
    ggplot () +
        geom_boxplot (aes (x = country, y = residuals, fill = method), varwidth = TRUE) +
        geom_hline (yintercept = 0, linetype = 2) +
        scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
        facet_wrap (~ app.mthd, ncol = 1) +
        ylab ("Residual") +
        labs (fill = "") +
        theme (legend.position = "bottom",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))

plot_residuals_evaluation_subset

Observed vs predicted values

plot_observed_vs_predicted_values = predictions_of_all_methods %>% 

    filter (dataset == "Evaluation subset") %>%

    mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
    
    ggplot () +

        geom_point (aes (x = truth_e, y = e, color = method), size = 3) +

        geom_abline (slope = 1, linetype = "dashed") +

        facet_wrap (~ method) +

        scale_color_manual (values = Dark2[c(2, 5, 6, 8)]) +

        labs (color = "") +

        theme (legend.position = "none",
               axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
               axis.title.x = element_text (margin = ggplot2::margin (t = 15, b = 20)),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +

        xlab ("Observed values") + ylab ("Predicted values") +

        NULL

plot_observed_vs_predicted_values

Evaluation metrics

df_evaluation_metrics = rbind (
    
    predictions_of_all_methods %>% 
        select (prediction = e, truth = truth_e, dataset, method) %>% 
        mutate (response = "72h cum. emission"),
    
    predictions_of_all_methods %>% 
        select (prediction = er, truth = truth_er, dataset, method) %>% 
        mutate (response = "72h relative cum. emission")
    
) %>%

    summarise (
      
        MSE = mean ((prediction - truth) ^ 2),
      
        RMSE = sqrt (mean ((prediction - truth) ^ 2)),
        
        Pearsons_r = cor (prediction, truth),

        ME = 1 - (sum ( (prediction - truth) ^ 2) / sum ( (truth - mean (truth)) ^ 2)),

        MAE = mean (abs (prediction - truth)),
        
        MBE = mean (prediction - truth),

        .by = c (dataset, method, response)

) %>% 

    mutate_if (is.numeric, round, digits = 3)

df_evaluation_metrics %>%  arrange (response, dataset) %>% kable ()
dataset method response MSE RMSE Pearsons_r ME MAE MBE
Calibration subset alfam2 72h cum. emission 113.864 10.671 0.823 0.610 6.644 -2.983
Calibration subset random forest 72h cum. emission 12.360 3.516 0.982 0.958 2.214 0.142
Calibration subset xgboost 72h cum. emission 0.535 0.731 0.999 0.998 0.260 -0.003
Calibration subset lasso 72h cum. emission 122.019 11.046 0.781 0.582 6.703 -2.561
Evaluation subset alfam2 72h cum. emission 65.292 8.080 0.860 0.618 5.647 -2.808
Evaluation subset random forest 72h cum. emission 20.339 4.510 0.942 0.881 3.279 0.924
Evaluation subset xgboost 72h cum. emission 38.335 6.192 0.895 0.776 4.094 0.507
Evaluation subset lasso 72h cum. emission 53.318 7.302 0.839 0.688 5.574 -1.381
Calibration subset alfam2 72h relative cum. emission 0.031 0.175 0.801 0.528 0.118 -0.056
Calibration subset random forest 72h relative cum. emission 0.003 0.054 0.979 0.956 0.038 0.006
Calibration subset xgboost 72h relative cum. emission 0.000 0.010 0.999 0.999 0.004 0.000
Calibration subset lasso 72h relative cum. emission 0.027 0.164 0.787 0.588 0.114 -0.043
Evaluation subset alfam2 72h relative cum. emission 0.023 0.151 0.793 0.581 0.114 -0.044
Evaluation subset random forest 72h relative cum. emission 0.011 0.103 0.915 0.806 0.076 0.033
Evaluation subset xgboost 72h relative cum. emission 0.014 0.116 0.886 0.752 0.084 0.008
Evaluation subset lasso 72h relative cum. emission 0.025 0.157 0.770 0.549 0.122 0.000
df_evaluation_metrics %>%
  
    filter (response == "72h cum. emission") %>%
  
    select (- MSE, - RMSE, - Pearsons_r, - ME) %>%

    pivot_longer (cols = c (MAE, MBE)) %>%

    arrange (name, dataset) %>%

    mutate (variation = (1 - abs(value) / abs(value [method == "alfam2"])) * 100, .by = c(name, dataset)) %>%
  
    kable ()
dataset method response name value variation
Calibration subset alfam2 72h cum. emission MAE 6.644 0.0000000
Calibration subset random forest 72h cum. emission MAE 2.214 66.6767008
Calibration subset xgboost 72h cum. emission MAE 0.260 96.0866948
Calibration subset lasso 72h cum. emission MAE 6.703 -0.8880193
Evaluation subset alfam2 72h cum. emission MAE 5.647 0.0000000
Evaluation subset random forest 72h cum. emission MAE 3.279 41.9337701
Evaluation subset xgboost 72h cum. emission MAE 4.094 27.5013281
Evaluation subset lasso 72h cum. emission MAE 5.574 1.2927218
Calibration subset alfam2 72h cum. emission MBE -2.983 0.0000000
Calibration subset random forest 72h cum. emission MBE 0.142 95.2396916
Calibration subset xgboost 72h cum. emission MBE -0.003 99.8994301
Calibration subset lasso 72h cum. emission MBE -2.561 14.1468320
Evaluation subset alfam2 72h cum. emission MBE -2.808 0.0000000
Evaluation subset random forest 72h cum. emission MBE 0.924 67.0940171
Evaluation subset xgboost 72h cum. emission MBE 0.507 81.9444444
Evaluation subset lasso 72h cum. emission MBE -1.381 50.8190883
# Evaluation on evalutation subset
df_plot = df_evaluation_metrics %>%
    filter (response == "72h cum. emission") %>%
    select (- MSE, - RMSE) %>%
    filter (dataset == "Evaluation subset") %>%
    pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
    mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))

suppressWarnings(ggplot (df_plot) +
    geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
    facet_wrap (~ name, scales = "free", nrow = 1) + 
    xlab ("") + ylab ("") + labs (fill = "") +
    theme (legend.position = "none ", 
           strip.text.y = element_text(angle = 0), 
           axis.text.x = element_blank(), 
           axis.ticks.x = element_blank(),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
    scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
    ggtitle ("Evaluation subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))

# Evaluation on calibration subset
df_plot = df_evaluation_metrics %>%
    filter (response == "72h cum. emission") %>%
    select (- MSE, - RMSE) %>%
    filter (dataset == "Calibration subset") %>%
    pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
    mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))

suppressWarnings(ggplot (df_plot) +
    geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
    facet_wrap (~ name, scales = "free", nrow = 1) + 
    xlab ("") + ylab ("") + labs (fill = "") +
    theme (legend.position = "bottom", 
           strip.text.y = element_text(angle = 0), 
           axis.text.x = element_blank(), 
           axis.ticks.x = element_blank(),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
    scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
    ggtitle ("Calibration subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))

Part 2 - New training with the whole dataset

Random forest

set.seed (123)
random_forest_model = randomForest (
    
        e.cum ~ ., 

        data = data_random_forest %>% select (- pmid, - e.rel, - dataset, - country),

        importance = TRUE,
    
        mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
    
)

Shapley

df_tmp = data_random_forest_calibration %>% select (- e.cum)

unified_model_rf <- treeshap::randomForest.unify (random_forest_model, df_tmp)

treeshap_rf <- treeshap::treeshap (unified_model_rf, df_tmp, verbose = FALSE)

shapley_rf <- shapviz (treeshap_rf, X = data_random_forest_calibration %>% select (- e.cum))
plot_shap_rf = shapviz::sv_importance (shapley_rf, kind = "beeswarm", max_display = 10L) +
    scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
    theme (
      legend.title = element_text (size = 25, margin = ggplot2::margin (r = 50, b = 100)),
      legend.position = "bottom",
      legend.key.width = unit(3, "cm"),
      axis.title.x = element_text (margin = ggplot2::margin( b = 30))
    )

plot_shap_rf

Gain

df_importance_rf = randomForest::importance (random_forest_model) %>%
    as.data.frame %>%
    rownames_to_column (var = "Variable") %>%
    mutate (`%IncMSE` = `%IncMSE` / max (`%IncMSE`)) %>%
    mutate (type = "A")

plot_importance_rf = ggplot (df_importance_rf) + 
        geom_point (aes (x = `%IncMSE`, y = Variable), size = 5) +
        scale_y_discrete (limits = df_importance_rf %>% arrange (`%IncMSE`) %>% pull (Variable)) +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        xlab ("Importance") +
        ylab ("") +
        facet_wrap (~ type)

plot_importance_rf

df_importance_rf %>%
    arrange (desc (`%IncMSE`)) %>%
    head (n = 10) %>% kable ()
Variable %IncMSE IncNodePurity type
app.mthd 1.0000000 18855.0746 A
tan.app 0.6135884 18809.3852 A
man.ph 0.5491638 13928.9417 A
incorp 0.4257513 6180.6429 A
t.incorp 0.3345969 4605.3732 A
man.dm 0.3251821 5602.9474 A
app.rate 0.2350342 4444.9184 A
temp_6 0.2256594 1241.4233 A
rain_6 0.2081427 1181.8105 A
rain_1 0.1985322 371.3292 A

Xgboost

set.seed (123)
xgboost_model = xgboost (  

    data = xgb.DMatrix (

        data = data_xgboost %>% 
            select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
            as.matrix %>%
            {.}, 

        label = data_xgboost %>% 
            select (e.cum) %>% 
            as.matrix %>%
            {.}

    ),

    max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,

    verbose = FALSE,
    
    objective = "reg:squarederror"

)

Shapley

df_tmp = data_xgboost %>% select (- pmid, - e.cum, - e.rel, - dataset, - country)

unified_model_xgboost <- treeshap::xgboost.unify (xgboost_model, as.matrix(df_tmp))

treeshap_xgboost <- treeshap::treeshap (unified_model_xgboost, df_tmp, verbose = FALSE)

shapley_xgb <- shapviz::shapviz (treeshap_xgboost, X = df_tmp)
plot_shap_xgboost = shapviz::sv_importance (shapley_xgb, kind = "beeswarm", max_display = 10L) +
  scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
  theme (legend.title = element_text (size = 30, margin = ggplot2::margin (r = 50, b = 100)),
         legend.position = "bottom",
         legend.key.width = unit(3, "cm"),
         axis.title.x = element_text (margin = ggplot2::margin( b = 30)))

plot_shap_xgboost

Gain

df_importance_xgb = xgboost::xgb.importance (model = xgboost_model) %>%
    mutate (Gain = Gain / max (Gain)) %>%
    mutate (type = "B")

plot_importance_xgboost = df_importance_xgb %>%

    ggplot() +
        geom_point (aes (x = Gain, y = Feature), size = 5) +
        scale_y_discrete (limits = df_importance_xgb %>% arrange (Gain) %>% pull (Feature)) +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        xlab ("Importance") + ylab ("") +
        facet_wrap (~ type)

plot_importance_xgboost

df_importance_xgb %>%
    arrange (desc (Gain)) %>%
    head (n = 10) %>% kable ()
Feature Gain Cover Frequency type
app.mthd 1.0000000 0.0144249 0.0210013 B
tan.app 0.9913541 0.0607105 0.1204637 B
man.ph 0.7498047 0.1183736 0.0562836 B
incorp 0.4445915 0.0048873 0.0189852 B
man.dm 0.2828499 0.0617338 0.0601478 B
app.rate 0.1648063 0.0317940 0.0460349 B
time 0.1421372 0.1771188 0.2436156 B
temp_1 0.1310175 0.0494286 0.0443548 B
temp_2 0.0701466 0.0453213 0.0431788 B
temp_4 0.0579904 0.0107690 0.0120968 B

Lasso

set.seed (123)
lasso_model = cv.glmnet (  

    x = data_lasso %>% 
        select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
        as.matrix %>%
        {.}, 

    y = data_lasso %>% 
        select (e.cum) %>% 
        as.matrix %>%
        {.},

    alpha = 1

)

Part 3 - Testing scenarios

load (file = "scripts/processed_data/data_scenarios.Rdata")

data_scenarios = data_scenarios %>%

    mutate (t.incorp = as.factor (t.incorp)) %>%
    mutate (t.incorp = recode (t.incorp, "1000" = "-"))

Random forest

load (file = "scripts/processed_data/data_scenarios_random_forest.Rdata")

data_tmp = data_scenarios_random_forest

random_forest_predictions_scenarios_vector = predict (
        random_forest_model,       
        newdata = rbind (data_random_forest_calibration[1, -1], data_tmp)[- 1, ]
)

data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_random_forest = random_forest_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_random_forest = ((e.cum_hat_random_forest / e.cum_hat_random_forest[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain)) %>% 

    {.}

Xgboost

load (file = "scripts/processed_data/data_scenarios_xgboost.Rdata")

xgboost_predictions_scenarios_vector = predict (
        
    xgboost_model, 

    data_scenarios_xgboost %>% as.matrix
)

data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_xgboost = xgboost_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_xgboost = ((e.cum_hat_xgboost / e.cum_hat_xgboost[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

Lasso

load (file = "scripts/processed_data/data_scenarios_lasso.Rdata")

lasso_predictions_scenarios_vector = predict (
        
    lasso_model, 

    data_scenarios_lasso %>% as.matrix
)


data_scenarios = data_scenarios %>% 

    mutate (e.cum_hat_lasso = lasso_predictions_scenarios_vector, .before = time) %>%

    mutate (efficacy_lasso = ((e.cum_hat_lasso / e.cum_hat_lasso[app.mthd == "bc" & incorp == "none"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

Alfam2

load (file = "scripts/processed_data/data_scenarios_alfam2.Rdata")

alfam2_predictions_scenarios_vector =  alfam2 (
        pars = alfam2pars01,
        dat = data_scenarios_alfam2,
        app.name = "tan.app",
        time.name = "time",
        time.incorp = "t.incorp",
        group = "pmid",
        prep = TRUE,
        warn = FALSE
)  %>%

    filter (time == max (time), .by = pmid) %>%
    select (pmid, e.cum_hat_alfam2 = e)

data_scenarios = data_scenarios %>%

    left_join (alfam2_predictions_scenarios_vector, by = "pmid") %>%

    mutate (efficacy_alfam2 = ((e.cum_hat_alfam2 / e.cum_hat_alfam2[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100, 
            .by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))

Comparison

plot_scenario_predictions = data_scenarios %>% 
    select (app.mthd, incorp, t.incorp, man.ph,
            efficacy_random_forest, efficacy_xgboost, efficacy_lasso, efficacy_alfam2) %>%

    rename (efficacy_rforest = efficacy_random_forest) %>%

    pivot_longer (cols = c (5 : 8), names_to = c (".value", "method"), names_pattern = "(.+)_(.+)") %>%
    mutate (method = recode (method, "rforest" = "random forest")) %>%

    mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%

    mutate (group = paste (app.mthd, incorp, t.incorp)) %>%
    filter (! (app.mthd == "bc" & incorp == "none")) %>%

    mutate (group = recode (group,
                            "bc shallow 0" = "incorporation",
                            "bsth none -" = "trailing hose",
                            "os none -" = "open slot",
                            "ts none -" = "trailing shoe")) %>%
    
    ggplot () +
        
        geom_boxplot (aes (x = efficacy, y = group, fill = group)) +
        scale_fill_manual (values = Dark2[c(2 : 5)]) +
        ylab ("") + xlab ("Efficacy") +
        theme (legend.position = "none",
               axis.ticks.y = element_blank(),
               strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
        facet_wrap (~ method, ncol = 1)

plot_scenario_predictions

data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    kable ()
efficacy_xgboost tan.app app.mthd app.rate man.source incorp group_temp group_wind group_rain
8.9906385 80.3025 ts 36.650 pig none q1 q1 q1
9.0057694 80.3025 ts 36.650 cat none q1 q1 q1
0.5613908 80.3025 bsth 18.725 pig none q1 q1 q2
0.5621355 80.3025 bsth 18.725 cat none q1 q1 q2
25.0445690 80.3025 bsth 36.650 pig none q1 q1 q2
18.2393264 80.3025 os 36.650 pig none q1 q1 q2
32.8066167 80.3025 ts 36.650 pig none q1 q1 q2
25.0855518 80.3025 bsth 36.650 cat none q1 q1 q2
18.2691620 80.3025 os 36.650 cat none q1 q1 q2
32.8602812 80.3025 ts 36.650 cat none q1 q1 q2
30.7067041 80.3025 bsth 36.650 pig none q1 q2 q2
23.4712954 80.3025 os 36.650 pig none q1 q2 q2
25.4853738 80.3025 ts 36.650 pig none q1 q2 q2
30.7704933 80.3025 bsth 36.650 cat none q1 q2 q2
23.5200540 80.3025 os 36.650 cat none q1 q2 q2
25.5383165 80.3025 ts 36.650 cat none q1 q2 q2
4.5797139 36.6595 ts 36.650 pig none q2 q1 q1
4.5901176 36.6595 ts 36.650 cat none q2 q1 q1
11.1164005 80.3025 ts 36.650 pig none q2 q1 q1
11.4099749 80.3025 ts 36.650 cat none q2 q1 q1
9.3674813 36.6595 ts 36.650 pig none q2 q1 q2
9.3912529 36.6595 ts 36.650 cat none q2 q1 q2
10.4259527 80.3025 bsth 18.725 pig none q2 q1 q2
7.6296711 80.3025 os 18.725 pig none q2 q1 q2
14.5180492 80.3025 ts 18.725 pig none q2 q1 q2
10.6716109 80.3025 bsth 18.725 cat none q2 q1 q2
7.8093910 80.3025 os 18.725 cat none q2 q1 q2
14.8600869 80.3025 ts 18.725 cat none q2 q1 q2
15.0486933 80.3025 bsth 36.650 pig none q2 q1 q2
11.4423307 80.3025 os 36.650 pig none q2 q1 q2
32.0733336 80.3025 ts 36.650 pig none q2 q1 q2
15.4434423 80.3025 bsth 36.650 cat none q2 q1 q2
11.7424345 80.3025 os 36.650 cat none q2 q1 q2
32.9145756 80.3025 ts 36.650 cat none q2 q1 q2
8.9087332 80.3025 ts 36.650 pig none q2 q2 q1
9.0807629 80.3025 ts 36.650 cat none q2 q2 q1
2.5264251 80.3025 bsth 18.725 pig none q2 q2 q2
0.7573102 80.3025 os 18.725 pig none q2 q2 q2
2.5725532 80.3025 bsth 18.725 cat none q2 q2 q2
0.7710910 80.3025 os 18.725 cat none q2 q2 q2
30.5988491 80.3025 bsth 36.650 pig none q2 q2 q2
27.8218260 80.3025 os 36.650 pig none q2 q2 q2
33.8994959 80.3025 ts 36.650 pig none q2 q2 q2
31.3037845 80.3025 bsth 36.650 cat none q2 q2 q2
28.4628769 80.3025 os 36.650 cat none q2 q2 q2
34.6804699 80.3025 ts 36.650 cat none q2 q2 q2
data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    lapply (table)
$tan.app

36.6595 80.3025 
      4      42 

$app.mthd

bsth   os   ts 
  14   12   20 

$app.rate

18.725  36.65 
    12     34 

$man.source

cat pig 
 23  23 

$incorp

none 
  46 

$group_temp

q1 q2 
16 30 

$group_wind

q1 q2 
28 18 

$group_rain

q1 q2 
 8 38 
data_tmp = data_scenarios %>%
    filter (efficacy_xgboost > 0) %>%
    select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
    mutate (group_rain = ifelse (group_rain == "q1", "low", "high")) %>%
    mutate (tan.app = paste ("TAN = ", tan.app),
            app.rate = paste ("Application rate = ", app.rate),
            group_rain = paste ("Rain = ", group_rain))

plot = ggplot (data_tmp) +
    geom_histogram (aes (efficacy_xgboost)) +
    facet_wrap (~ tan.app + app.rate + group_rain) +
    xlab ("Efficacy") +
    ylab ("Number of predictions") +
    theme (axis.title.y = element_text (margin = ggplot2::margin (r = 20)),
           strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)),
           strip.text = element_text (size = 20))

plot

data_scenarios %>%
  filter (! (app.mthd == "bc" & incorp == "none")) %>%
  summarise (mean_rf = mean (efficacy_random_forest), 
             mean_alfam2 = mean (efficacy_alfam2), 
             mean_gbf = mean (efficacy_xgboost), 
             mean_lasso = mean (efficacy_lasso), .by = c ('app.mthd')) %>%
  kable()
app.mthd mean_rf mean_alfam2 mean_gbf mean_lasso
bsth -52.11648 -34.29654 -38.60492 -23.72561
os -52.75687 -61.75070 -41.86650 -42.49275
ts -50.90685 -34.29654 -34.98651 -23.72561
bc -16.64187 -43.07410 -34.74081 -21.99013

Figures

legend = cowplot::get_legend (plot_shap_rf)

plot_shap_rf_2 = plot_shap_rf +
    annotate (geom = "label", label = "A", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
    xlim (c (- 18, 45)) +
    theme (legend.position = "none", 
          plot.margin = unit (c (0, 0, 1, 0), "cm"),
          axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          axis.ticks.x = element_blank())

plot_shap_xgboost_2 = plot_shap_xgboost +
    annotate (geom = "label", label = "B", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
    xlim (c (- 18, 45)) + 
    theme (legend.position = "none", 
          plot.margin = unit (c (0, 0, 1, 0), "cm"),
          axis.title.x = element_text (margin = ggplot2::margin (t = 10, b = 20)))


png (file = "figures/plot_shap.png", width = 1000, height = 900)
do.call ("grid.arrange", 
         c (list(plot_shap_rf_2, 
                 plot_shap_xgboost_2, 
                 legend), 
            list(layout_matrix = as.matrix(c (1, 2, 3)),
                  heights = c (0.8, 1, 0.2))))
dev.off()
png 
  2 
png (file = "figures/plot_importance.png", width = 1300, height = 800)
grid.arrange (plot_importance_rf, plot_importance_xgboost, nrow = 1)
dev.off()
png 
  2 

Supplementary material

data = read_excel("data/Peng_Xu_et_al_2024/Supplementary_Table_3_use2775.xlsx", sheet = 2)
head (data) %>% select (- Title) %>% kable()
No. Author Language Year Replicates Fertilizer type Nitrogen placement Fertilizer application time Soil tillage practices Crop type Water input Tem SOC TN pH BD Clay CEC Nrate Erate
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 113 22.1
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 150 32.7
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 188 35.9
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 225 44.6
1 Lin et al. (2015) Chinese 2013 3 U SBC 1 CT Rice 1289 26.0 5.4 0.6 7.9 1.3 20.0 18.0 300 63.1
2 Zhu et al. (2013) Chinese 2012 3 Others Mix 2 CT Rice 1500 25.1 21.9 1.9 5.8 1.1 38.3 11.4 113 38.3
plot_temperature_comparison = rbind (
    data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
    data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%

    ggplot () +
        geom_histogram (aes (x = temp)) +
        facet_wrap (~ data, ncol = 1, scales = "free") +
        theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))+
        xlab ("Air temperature (°C)") +
        ylab ("Number of observations")

plot_temperature_comparison

rbind (
    data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
    data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
  
  summarise (mean_temp = mean (temp), sd_temp = sd (temp), .by = data)
# A tibble: 2 × 3
  data                           mean_temp sd_temp
  <chr>                              <dbl>   <dbl>
1 Data from Peng Xu et al (2024)      20.7    6.86
2 Data used for the alfam2 model      13.3    5.48